### "Allegiance, Ability, and Achievement in the American Civil War:
### Commander Traits and Battlefield Military Effectiveness"

### By Jeffrey B. Arnold, J. Tyson Chatagnier, and Gary E. Hollibaugh, Jr.

##### IMPUTATION CODE.
##### DO NOT RUN THIS FILE DIRECTLY.

### Load in packages
library("tidyverse")
library("Amelia")

## Preprocessing
## Exclude the following battles:
### - Fort Sumter since it had no casualties and started the war,
### - Appomattox Court House since it is a complete surrender to end the war
### - Palmito Ranch (May 1865) since it is after Lee's surrender
EXCLUDED_BATTLES <- c(
  "VA097",
  "TX005",
  "SC001"
)

# State abbreviations treated as Confederate; used below to flag battles
# fought in Confederate states.
CSA <- c(
  "SC", "MS", "FL", "AL", "GA", "LA",
  "TX", "VA", "AR", "NC", "TN"
)

# Military rank lookup table. The code below reads its `rank` and `level`
# columns to turn commander rank labels into numbers.
milranks <- read.csv(file = file.path(DIR_DATA, "ACH-military_ranks.csv"))

## Preprocess the battle data.
### - exclude the aforementioned battles and primarily naval battles
### - rename or edit variables to be backwards compatible with previous data
### - keep only the columns used further down in this file
raw_battles <-
  file.path(DIR_DATA, "ACH-battles_summary.csv") %>%
  read_csv(
    # only empty fields are read as missing
    na = "",
    col_types = cols(
      # every column not listed explicitly is read as text
      .default = col_character(),
      start_date = col_date(format = "%Y-%m-%d"),
      end_date = col_date(format = "%Y-%m-%d"),
      mid_date = col_date(format = "%Y-%m-%d"),
      lat = col_double(),
      long = col_double(),
      day_of_war = col_double(),
      casualties_C = col_double(),
      casualties_U = col_double(),
      strength_C = col_double(),
      strength_U = col_double(),
      duration = col_integer(),
      year = col_integer(),
      war_phase = col_integer(),
      siege = col_logical(),
      commander_navy_C = col_logical(),
      commander_navy_U = col_logical()
    )
  ) %>%
  # NOTE: filter() keeps a row only when every condition is TRUE, so a row
  # whose `naval` value is missing is dropped along with the "Yes" rows.
  filter(
    !(cwsac_id %in% EXCLUDED_BATTLES),
    naval != "Yes"
  ) %>%
  rename(
    outcome = result,
    theater = theater_code
  ) %>%
  mutate(
    outcome = recode(outcome, "Inconclusive" = "Indecisive"),
    # TRUE when the battle's state is one of the CSA abbreviations
    confed_battle = state %in% CSA
  ) %>%
  # Column order matters downstream (columns are later addressed by position),
  # so keep this list in the same order.
  dplyr::select(
    cwsac_id,
    casualties_U, strength_U,
    casualties_C, strength_C,
    outcome, surrender, significance,
    confed_battle, attacker,
    start_date, duration,
    theater, lat, long,
    naval,
    commander_rank_C, commander_rank_U,
    commander_navy_C, commander_navy_U
  )

# Remove battles where the highest-ranking commander was in the navy.
#
# Step 1: translate each side's rank label into the numeric `level` stored in
# `milranks`; labels with no match in `milranks$rank` become NA.
# (presumably a larger `level` means a higher rank -- confirm against the
# ranks file)
raw_battles <- raw_battles %>%
  mutate(
    commander_rank_C_number =
      milranks$level[match(as.character(commander_rank_C),
                           as.character(milranks$rank))],
    commander_rank_U_number =
      milranks$level[match(as.character(commander_rank_U),
                           as.character(milranks$rank))]
  )

# Step 2: drop a battle when the side with the larger rank number has a navy
# commander, then discard the helper columns that are no longer needed.
# NOTE: subset() treats a missing condition as FALSE, so rows for which the
# test evaluates to NA (missing rank and/or navy flag) are dropped as well.
raw_battles <- subset(
  raw_battles,
  !(commander_rank_C_number > commander_rank_U_number &
      commander_navy_C == TRUE) &
    !(commander_rank_C_number < commander_rank_U_number &
        commander_navy_U == TRUE),
  select = -c(commander_rank_U, commander_rank_C,
              commander_navy_C, commander_navy_U)
)
## Imputation
# Bounds for the strength and casualty columns, one row per column:
# (column index in `raw_battles`, lower bound 0, upper bound Inf).
# NOTE(review): `bound.matrix` is built here but is not passed to `amelia()`
# below, so these bounds are not enforced during imputation.
bound.matrix <- cbind(
  grep("strength|casualties", colnames(raw_battles)),
  0,
  Inf
)

# Impute missing data using `Amelia`.

# Fix the random seed so the imputations are reproducible.
set.seed(1)

# Number of imputed data sets to generate.
M <- 1000

imputed <- amelia(
  as.data.frame(raw_battles),
  m = M,
  p2s = 0,
  # identifier column: carried along, not used in the imputation model
  idvars = "cwsac_id",
  # time variable and the smooth-in-time setting
  ts = "start_date",
  splinetime = 6,
  # variable types
  noms = c("significance", "outcome", "theater", "surrender",
           "confed_battle", "attacker", "naval"),
  ords = c("commander_rank_C_number", "commander_rank_U_number"),
  # square-root transformed variables
  sqrts = c("casualties_U", "strength_U",
            "casualties_C", "strength_C",
            "duration"),
  # empirical prior expressed as a fraction of the row count; the fraction is
  # 0.0, so this evaluates to 0
  empri = 0.0 * nrow(raw_battles)
)

# We'll simply use the geometric mean of the values of the imputed data
# rather than the imputations themselves.

# Utility to compute the geometric mean, which base R does not provide.
#
# Arguments:
#   x      numeric vector.
#   na.rm  if TRUE (the default), missing values are removed first.
#
# Returns exp(mean(log(x))) after negative entries have been raised to zero
# (a truncation that wasn't done in `amelia`). Because log(0) is -Inf, the
# result is 0 whenever any entry is zero or negative.
gmean <- function(x, na.rm = TRUE) {
  values <- if (na.rm) x[!is.na(x)] else x
  # clamp negatives to zero so log() never sees a negative number
  clamped <- pmax(values, 0)
  exp(mean(log(clamped)))
}

# Per-battle summaries of the imputed strength/casualty columns.
#
# All imputed data sets are stacked into one long table, reduced to the battle
# id plus the four strength/casualty columns, and grouped by battle. Three
# summaries are then taken across the imputations of each battle:
#   battles_imputed_mean  geometric mean (see `gmean`)
#   battles_imputed_max   95th percentile, columns suffixed with "_max"
#   battles_imputed_min   5th percentile,  columns suffixed with "_min"
#
# The stacked table is built once and reused rather than being rebuilt for
# every summary.
imputed_long <-
  imputed$imputations %>%
  rlang::as_list() %>%
  bind_rows() %>%
  dplyr::select(cwsac_id, matches("^(casualties|strength)_(U|C)$")) %>%
  group_by(cwsac_id)

# Plain functions are passed to summarise_all() instead of the deprecated
# funs() wrapper; with a single function the column names are left unchanged.
battles_imputed_mean <- summarise_all(imputed_long, gmean)

battles_imputed_max <- summarise_all(
  imputed_long,
  function(x) as.numeric(quantile(x, 0.95, na.rm = TRUE))
)

battles_imputed_min <- summarise_all(
  imputed_long,
  function(x) as.numeric(quantile(x, 0.05, na.rm = TRUE))
)

# Derive the suffixed names from the columns actually present (everything
# except the leading `cwsac_id`) so a change in column order can never attach
# a label to the wrong variable.
colnames(battles_imputed_max)[-1] <-
  paste0(colnames(battles_imputed_max)[-1], "_max")

colnames(battles_imputed_min)[-1] <-
  paste0(colnames(battles_imputed_min)[-1], "_min")

# Assemble the final table step by step. merge() joins on the column names the
# two inputs share, which here is the battle id `cwsac_id`.
# 1. percentile bounds (min, then max)
battles_imputed <- merge(battles_imputed_min, battles_imputed_max)
# 2. geometric means placed in front of the bounds
battles_imputed <- merge(battles_imputed_mean, battles_imputed)
# 3. all remaining battle covariates; the raw strength/casualty columns are
#    left out because the summaries above replace them
battles_imputed <- merge(
  battles_imputed,
  raw_battles[, !grepl("strength|casualties", colnames(raw_battles))]
)

## Save to incorporate into later analyses
saveRDS(
  object = battles_imputed,
  file = file.path(DIR_DATA, "ACH-BattlesImputed.rds")
)